Filter criteria:
Filter differentially expressed genes between autism and control (p-value < 0.05)
No samples are removed based on network connectivity z-scores
# Report dataset dimensions and the diagnosis split (samples are columns)
glue('Number of genes: {nrow(datExpr)}\n',
     'Number of samples: {ncol(datExpr)} ({sum(datMeta$Diagnosis_ == "ASD")} ASD, ',
     '{sum(datMeta$Diagnosis_ != "ASD")} controls)')
## Number of genes: 6417
## Number of samples: 86 (51 ASD, 35 controls)
First principal component explains over 90% of the total variance
reduce_dim_datExpr = function(datExpr, datMeta, var_explained=0.98){
  # Reduce the expression data to the top principal components that together
  # explain at least `var_explained` of the total variance.
  #   datExpr: expression data, rows = genes (the observations for PCA)
  #   datMeta: sample metadata (kept for interface compatibility; unused here)
  #   var_explained: cumulative-variance threshold in (0, 1]
  # Returns a list with the reduced data ('datExpr') and the full prcomp
  # object ('pca_output') so other points can be projected into this space.
  datExpr = data.frame(datExpr)
  datExpr_pca = prcomp(datExpr, scale.=TRUE)
  pca_importance = summary(datExpr_pca)$importance
  # Scree plots: per-PC proportion of variance and cumulative proportion,
  # with the threshold drawn on the cumulative plot
  par(mfrow=c(1,2))
  plot(pca_importance[2,], type='b')
  plot(pca_importance[3,], type='b')
  abline(h=var_explained, col='blue')
  # First PC at which the cumulative variance reaches the threshold.
  # BUG FIX: the original filtered the cumulative table and took
  # top_n(-1, ID) on the PC *names*, which sorts lexicographically
  # ('PC10' < 'PC9') and can pick the wrong component; selecting by
  # position is always correct.
  n_pcs = which(pca_importance[3,] >= var_explained)[1]
  print(glue('Keeping top ', n_pcs, ' components that explain ',
             var_explained*100, '% of the variance'))
  datExpr_top_pc = data.frame(datExpr_pca$x)[, seq_len(n_pcs), drop=FALSE]
  return(list('datExpr'=datExpr_top_pc, 'pca_output'=datExpr_pca))
}
# Reduce the expression matrix to the PCs explaining 98% of the variance
reduce_dim_output = reduce_dim_datExpr(datExpr, datMeta)
## Keeping top 12 components that explain 98% of the variance
datExpr_redDim = reduce_dim_output$datExpr
# Keep the prcomp object: used later to project the unfiltered data (L43)
pca_output = reduce_dim_output$pca_output
# Free objects no longer needed (datExpr is reloaded from disk further down)
rm(datSeq, datProbes, reduce_dim_datExpr, reduce_dim_output, datExpr)
Genes seem to have separated into two clouds of points:
# Scatter of the genes on the first two principal components
ggplot(datExpr_redDim, aes(x=PC1, y=PC2)) + geom_point() + theme_minimal()
Projecting all the original points into the space created by the two principal components and colouring by the differential expression p-value we can see that the points in the middle of the two clouds were filtered out because their DE wasn’t statistically significant
# Reload the full (pre-filter) normalized expression data
load('./../working_data/RNAseq_ASD_4region_normalized.Rdata')
# Remove Subject with ID = 'AN03345'
keep = datMeta$Subject_ID!='AN03345'
datMeta = datMeta[keep,]
datExpr = datExpr[,keep]
# Calculate differential expression p-value of each gene
# limma fit with Diagnosis as the covariate; subjects appear in multiple
# samples, so duplicateCorrelation estimates the within-subject correlation
# and lmFit treats Subject_ID as a block
mod = model.matrix(~ Dx, data=datMeta)
corfit = duplicateCorrelation(datExpr, mod, block=datMeta$Subject_ID)
lmfit = lmFit(datExpr, design=mod, block=datMeta$Subject_ID, correlation=corfit$consensus)
# Moderated statistics; trend/robust empirical Bayes
fit = eBayes(lmfit, trend=T, robust=T)
ASD_pvals = fit$p.value[,'DxASD']
# Project ALL genes into the PCA space learned on the filtered data
# (scale() to match prcomp(scale.=TRUE); %*% binds before %>% here)
pca_data_projection = scale(datExpr) %*% pca_output$rotation %>% data.frame
pca_data_projection %>% ggplot(aes(x=PC1, y=PC2, color=ASD_pvals)) + geom_point(alpha=0.5) + theme_minimal()
rm(mod, corfit, lmfit, fit, ASD_pvals, datExpr, datProbes, datSeq)
clusterings = list()
set.seed(123)
# Elbow curve: total within-cluster sum of squares for k = 1..10
# (MacQueen k-means, 25 random starts per k)
wss = vapply(1:10, function(k){
  kmeans(datExpr_redDim, k, iter.max=100, nstart=25,
         algorithm='MacQueen')$tot.withinss
}, numeric(1))
plot(wss, type='b', main='K-Means Clustering')
best_k = 4
abline(v = best_k, col='blue')
# Final fit at the chosen k
datExpr_k_means = kmeans(datExpr_redDim, best_k, nstart=25)
clusterings[['km']] = datExpr_k_means$cluster
Chose k=6 as the best number of clusters (note: the k-means code above sets best_k = 4; confirm which value was actually used).
Clusters seem to be able to separate ASD and control samples pretty well and there are no noticeable patterns regarding sex, age or brain region in any cluster.
Younger ASD samples seem to be more similar to control samples than older ASD samples (pink cluster has most of the youngest samples). The yellow cluster is made of young ASD samples.
# Hierarchical clustering on Euclidean distances (hclust defaults)
h_clusts = hclust(dist(datExpr_redDim))
plot(h_clusts, hang = -1, cex = 0.6, labels=FALSE)
best_k = 5
# Cut the dendrogram into best_k groups
clusterings[['hc']] = cutree(h_clusts, best_k)
Samples are grouped into two big clusters and then each cluster in 4 and 6 subclusters, respectively. The first big separation into two clusters is very clear, the subclusters not so much.
Output plots are saved in the clustering_genes_03_15 folder.
Following this paper’s guidelines:
Run PCA and keep enough components to explain 60% of the variance
Run ICA with that same number of nbComp as principal components kept to then filter them
Select components with kurtosis > 3
Assign genes to clusters using the fdrtool package (the paper suggests FDR<0.001; the code below applies a q-value cutoff of 0.01)
# Independent Component Analysis with as many components as kept PCs
ICA_output = datExpr_redDim %>% runICA(nbComp=ncol(datExpr_redDim), method='JADE')
# Keep only the components with kurtosis > 3.
# BUG FIX: the original `names(apply(...) > 3)` returned ALL column names
# (the comparison keeps names on every element, TRUE or FALSE), so the
# kurtosis filter was a no-op; which() restricts to components that pass.
keep_components = names(which(apply(ICA_output$S, 2, kurtosis) > 3))
signals_w_kurtosis = ICA_output$S %>% data.frame %>% dplyr::select(all_of(keep_components))
# Gene-to-component membership via fdrtool q-values (cutoff 0.01)
ICA_clusters = apply(signals_w_kurtosis, 2, function(x) fdrtool(x, plot=F)$qval<0.01) %>% data.frame
# Drop components that capture at most one gene
ICA_clusters = ICA_clusters[colSums(ICA_clusters)>1]
# Flat labelling: each gene gets the last component it belongs to (NA if none)
clusterings[['ICA_min']] = rep(NA, nrow(ICA_clusters))
ICA_clusters_names = colnames(ICA_clusters)
for(c in ICA_clusters_names) clusterings[['ICA_min']][ICA_clusters[,c]] = c
Leaves most of the observations (~75%) without a cluster:
# How many components each gene belongs to (0 = left unclustered)
ICA_clusters %>% rowSums %>% table
## .
## 0 1 2 3 4 5 6 7
## 4851 991 390 131 37 13 3 1
# Membership heatmap: genes vs ICA components
ICA_clusters %>% mutate(cl_sum=rowSums(.)) %>% as.matrix %>% melt %>% ggplot(aes(Var2,Var1)) +
geom_tile(aes(fill=value)) + xlab('Clusters') + ylab('Samples') +
theme_minimal() + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) + coord_flip()
best_power=40 but blockwiseModules only accepts powers up to 30, so 30 was used instead
# Scale-free-topology fit over soft-threshold powers 30..50
# (power 30 is hard-coded downstream because blockwiseModules accepts at
#  most 30, even though the fit keeps improving at higher powers)
best_power = datExpr_redDim %>% t %>% pickSoftThreshold(powerVector = seq(30, 50, by=1))
## Power SFT.R.sq slope truncated.R.sq mean.k. median.k. max.k.
## 1 30 0.637 -0.320 0.951 817 747 2090
## 2 31 0.675 -0.329 0.955 792 713 2050
## 3 32 0.700 -0.337 0.954 768 683 2020
## 4 33 0.724 -0.348 0.954 745 653 1980
## 5 34 0.745 -0.356 0.947 723 625 1950
## 6 35 0.774 -0.368 0.946 702 597 1920
## 7 36 0.793 -0.378 0.948 683 571 1890
## 8 37 0.808 -0.386 0.943 664 548 1860
## 9 38 0.823 -0.394 0.946 646 525 1830
## 10 39 0.837 -0.401 0.948 628 502 1800
## 11 40 0.857 -0.410 0.954 612 481 1770
## 12 41 0.863 -0.420 0.950 596 463 1750
## 13 42 0.874 -0.427 0.951 581 445 1720
## 14 43 0.878 -0.437 0.949 566 427 1700
## 15 44 0.890 -0.444 0.950 552 411 1670
## 16 45 0.894 -0.452 0.947 539 395 1650
## 17 46 0.892 -0.460 0.936 526 380 1630
## 18 47 0.897 -0.470 0.935 513 365 1610
## 19 48 0.901 -0.478 0.930 501 352 1580
## 20 49 0.909 -0.487 0.936 489 339 1560
## 21 50 0.916 -0.493 0.938 478 326 1540
# WGCNA modules on the transposed reduced data
# power=30: the maximum blockwiseModules accepts (see soft-threshold scan)
network = blockwiseModules(t(datExpr_redDim), power=30, numericLabels=TRUE)
clusterings[['WGCNA']] = setNames(network$colors, rownames(datExpr_redDim))
It leaves 476 genes without a cluster:
# Module sizes; label 0 = genes WGCNA left unassigned
table(clusterings[['WGCNA']])
##
## 0 1 2 3 4 5 6 7 8 9
## 476 3651 1350 314 294 180 50 48 27 27
Number of clusters that resemble more Gaussian mixtures = 32 but perhaps a smaller number could also work (maybe 5, 12 or 17)
# Gaussian mixture model: pick the number of components by BIC
n_clust = Optimal_Clusters_GMM(datExpr_redDim, max_clusters=50, criterion='BIC', plot_data=FALSE)
plot(n_clust, type='l', main='Bayesian Information Criterion to choose number of clusters')
best_k = 32
gmm = GMM(datExpr_redDim, best_k)
# Hard assignment: the component with the highest log-likelihood per row
clusterings[['GMM']] = apply(gmm$Log_likelihood, 1, which.max)
# Append the centroids (labelled NA) so they can be drawn on the same plot
centroid_rows = setNames(data.frame(gmm$centroids), names(datExpr_redDim))
gmm_points = rbind(datExpr_redDim, centroid_rows)
gmm_labels = as.factor(c(clusterings[['GMM']], rep(NA, best_k)))
ggplotly(gmm_points %>% ggplot(aes(x=PC1, y=PC2, color=gmm_labels)) + geom_point() + theme_minimal())
Separate the two clouds of points by a straight line. There seems to be a difference in the mean expression of the genes between clusters but not in their standard deviation.
# Manual clustering: split the two clouds with the line PC2 = 0.08*PC1 + 0.2
# (label 1 = points below the line, label 0 = above)
manual_clusters = as.factor(as.numeric(0.08*datExpr_redDim$PC1 + 0.2 > datExpr_redDim$PC2))
datExpr_redDim %>% ggplot(aes(x=PC1, y=PC2, color=manual_clusters)) + geom_point() +
geom_abline(slope=0.08, intercept=0.2, color='gray') + theme_minimal()
names(manual_clusters) = rownames(datExpr_redDim)
clusterings[['Manual']] = manual_clusters
# Per-row mean and sd across the kept PCs, plus the cluster label.
# NOTE(review): cbind() coerces the factor to its integer codes, so the
# 'cluster' column is recoded 1/2 rather than the original 0/1 labels.
manual_clusters_data = cbind(apply(datExpr_redDim, 1, mean), apply(datExpr_redDim, 1, sd),
manual_clusters) %>% data.frame
colnames(manual_clusters_data) = c('mean','sd','cluster')
manual_clusters_data = manual_clusters_data %>% mutate('cluster'=as.factor(cluster))
# Density of per-row means, by cluster
manual_clusters_data %>% ggplot(aes(x=mean, color=cluster, fill=cluster)) +
geom_density(alpha=0.4) + theme_minimal()
# Density of per-row standard deviations, by cluster
manual_clusters_data %>% ggplot(aes(x=sd, color=cluster, fill=cluster)) +
geom_density(alpha=0.4) + theme_minimal()
# NOTE(review): cc_output* come from a consensus-clustering chunk not shown
# in this file section
rm(wss, datExpr_k_means, h_clusts, cc_output, cc_output_c1, cc_output_c2, best_k, ICA_output,
ICA_clusters_names, signals_w_kurtosis, n_clust, gmm, gmm_points, gmm_labels, network,
best_power, c, manual_clusters, manual_clusters_data)
Using Adjusted Rand Index:
Clusterings seem to give very different results and none resembles the manual separation
Simple methods give similar results (K-means, hierarchical clustering, consensus clustering)
# Pairwise clustering similarity via the Adjusted Rand Index.
# Only the upper triangle (including the diagonal) is computed; the lower
# triangle stays NA, which is how the heatmap below renders it.
cluster_sim = data.frame(matrix(nrow = length(clusterings), ncol = length(clusterings)))
for(i in seq_along(clusterings)){  # seq_along() is safe for an empty list
  cluster1 = as.factor(clusterings[[i]])
  for(j in i:length(clusterings)){
    cluster2 = as.factor(clusterings[[j]])
    cluster_sim[i,j] = adj.rand.index(cluster1, cluster2)
  }
}
colnames(cluster_sim) = names(clusterings)
rownames(cluster_sim) = colnames(cluster_sim)
cluster_sim = cluster_sim %>% as.matrix %>% round(2)
# Annotated heatmap, fixed row/column order, no dendrograms
heatmap.2(x = cluster_sim, Rowv = FALSE, Colv = FALSE, dendrogram = 'none',
cellnote = cluster_sim, notecol = 'black', trace = 'none', key = FALSE,
cexRow = 1, cexCol = 1, margins = c(7,7))
rm(i, j, cluster1, cluster2, cluster_sim)
create_2D_plot = function(cat_var){
  # Interactive PC1-vs-PC2 scatter, coloured by the column named by `cat_var`
  # (a string) in the global `plot_points` data frame. Axis labels show the
  # percentage of variance explained by each PC (from the global pca_output).
  pc_var = summary(pca_output)$importance[2,]
  # aes_string() is deprecated in ggplot2; use the .data pronoun instead,
  # with an explicit legend title to keep the aes_string() labelling.
  ggplotly(plot_points %>% ggplot(aes(x=PC1, y=PC2, color=.data[[cat_var]])) +
             geom_point() + theme_minimal() + labs(color=cat_var) +
             xlab(paste0('PC1 (', round(pc_var[[1]]*100,2),'%)')) +
             ylab(paste0('PC2 (', round(pc_var[[2]]*100,2),'%)')))
}
create_3D_plot = function(cat_var){
  # Interactive 3D scatter of the first three PCs, coloured by the column
  # named by `cat_var` in the global `plot_points` data frame; axis titles
  # carry the % of variance explained by each PC (from pca_output).
  pc_var = summary(pca_output)$importance[2,]
  plot_points %>%
    plot_ly(x=~PC1, y=~PC2, z=~PC3) %>%
    add_markers(color=plot_points[,cat_var], size=1) %>%
    layout(title = glue('Samples coloured by ', cat_var),
           scene = list(xaxis=list(title=glue('PC1 (',round(pc_var[1]*100,2),'%)')),
                        yaxis=list(title=glue('PC2 (',round(pc_var[2]*100,2),'%)')),
                        zaxis=list(title=glue('PC3 (',round(pc_var[3]*100,2),'%)'))))
}
# Assemble the plotting table: first three PCs per row plus one factor
# column per clustering result.
# NOTE(review): 'cc_l1'/'cc_l2' (consensus clustering) are produced by a
# chunk not shown in this file section.
plot_points = datExpr_redDim %>% data.frame() %>% dplyr::select(PC1:PC3) %>%
mutate(ID = rownames(.), km_clust = as.factor(clusterings[['km']]),
hc_clust = as.factor(clusterings[['hc']]), cc_l1_clust = as.factor(clusterings[['cc_l1']]),
cc_clust = as.factor(clusterings[['cc_l2']]), ica_clust = as.factor(clusterings[['ICA_min']]),
n_ica_clust = as.factor(rowSums(ICA_clusters)), gmm_clust = as.factor(clusterings[['GMM']]),
wgcna_clust = as.factor(clusterings[['WGCNA']]), manual_clust=as.factor(clusterings[['Manual']]))
2D plots of clusterings
Simple methods seem to only partition the space in buckets using information from the first component
ICA seems to be the only one to distinguish between the two clouds of points, although it is really noisy
WGCNA creates clusters inverted between clouds
# 2D PC1/PC2 scatter for each clustering method
create_2D_plot('km_clust')
create_2D_plot('hc_clust')
create_2D_plot('cc_l1_clust')
create_2D_plot('cc_clust')
create_2D_plot('ica_clust')
create_2D_plot('gmm_clust')
create_2D_plot('wgcna_clust')
3D plots
# 3D view of the ICA clustering; the palette warnings below are expected
# because there are more clusters than Set2's 8 colours
create_3D_plot('ica_clust')
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors